#pip install xgboost
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from pandas.plotting import table
import plotly.express as px
import matplotlib.ticker as mtick
import plotly.io as pio
pio.renderers.default = 'notebook_connected'
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from prophet import Prophet
# Load the qualitative (site attributes) and time-series (daily sales) datasets.
q_data = pd.read_csv("qualitative_data_msba.csv")
t_data = pd.read_csv("time_series_data_msba.csv")
q_data.head()
t_data.head()
# Drop the leftover CSV row-index column from both frames.
q_data = q_data.drop("Unnamed: 0", axis=1)
t_data = t_data.drop("Unnamed: 0", axis=1)
q_data.head()
t_data.head()
q_data.shape
t_data.shape
q_data[['square_feet','x1_mile_pop','x1_mile_emp','x1_mile_income','x1_2_mile_pop','x1_2_mile_emp']].describe()
t_data[['daily_yoy_ndt.total_inside_sales', 'daily_yoy_ndt.total_food_service', 'diesel', 'unleaded']].describe()
# Remove site attributes not used downstream.
q_data.drop(['front_door_count','godfather_s_pizza','car_wash','ev_charging','non_24_hour','self_check_out','diesel'], axis=1, inplace=True)
q_data.shape
na_values = q_data.isna().sum()
na_values[na_values > 0]
cat_col = q_data.select_dtypes(include="object").columns.tolist()
cat_col
columns_with_na = q_data.columns[q_data.isna().any()].tolist()
# Fill every missing value with the explicit category "None".
# BUG FIX: the original also computed each column's mode here (mode_value)
# but never used it — that dead computation has been removed.
for column in columns_with_na:
    q_data[column].fillna("None", inplace=True)
# Integer-encode every categorical (object) column.
for column in cat_col:
    q_data[column] = pd.factorize(q_data[column])[0]
# Bin parking_spaces into three ordinal size classes: (20-30]=1, (30-40]=2, (40-50]=3.
# FIX: removed the redundant mid-file `import pandas as pd` — pandas is already
# imported at the top of the file.
# Define bins and labels
bins = [20, 30, 40, 50]
labels = [1, 2, 3]
# Bin the values in the 'parking_spaces' column.
# include_lowest=True so a value of exactly 20 lands in the first bin.
# NOTE(review): .astype(int) raises if any value falls outside [20, 50]
# (pd.cut yields NaN there) — assumes the data stays in range; confirm.
q_data['parking_spaces'] = pd.cut(q_data['parking_spaces'], bins=bins, labels=labels, include_lowest=True).astype(int)
q_data.head()
na_values = t_data.isna().sum()
na_values[na_values > 0]
# Parse the calendar date column once and derive year/month/day features.
# FIX: the original converted the column to datetime a second time right
# before set_index; the redundant second conversion was removed.
t_data['calendar.calendar_day_date'] = pd.to_datetime(t_data['calendar.calendar_day_date'])
t_data['calendar_year'] = t_data['calendar.calendar_day_date'].dt.year
t_data['calendar_month'] = t_data['calendar.calendar_day_date'].dt.month
t_data['calendar_day'] = t_data['calendar.calendar_day_date'].dt.day
t_data.head()
# Index by date so per-day aggregation is a simple groupby on the index.
t_data.set_index('calendar.calendar_day_date', inplace=True)
# Average fuel sales across all sites for each calendar day.
average_diesel_sales = t_data.groupby(t_data.index)['diesel'].mean()
average_unleaded_sales = t_data.groupby(t_data.index)['unleaded'].mean()
average_sales_df = pd.DataFrame({'Diesel': average_diesel_sales, 'Unleaded': average_unleaded_sales})
fig = px.line(average_sales_df, x=average_sales_df.index, y=average_sales_df.columns,
              title='Average Diesel and Unleaded Sales Over Time')
fig.update_xaxes(title='Date', rangeslider_visible=True)
fig.update_yaxes(title='Average Sales')
fig.show()
# Daily cross-site averages for inside (merchandise) and food-service sales.
average_inside_sales = t_data.groupby(t_data.index)['daily_yoy_ndt.total_inside_sales'].mean()
average_food_sales = t_data.groupby(t_data.index)['daily_yoy_ndt.total_food_service'].mean()
# The two mean-series keep their source column names, so a column-wise concat
# reproduces the same frame as building a dict of columns.
average_sales_df = pd.concat([average_inside_sales, average_food_sales], axis=1)
fig = px.line(
    average_sales_df,
    x=average_sales_df.index,
    y=average_sales_df.columns,
    title='Average Inside and Food Sales Over Time',
)
fig.update_xaxes(title='Date', rangeslider_visible=True)
fig.update_yaxes(title='Average Sales')
fig.show()
# Fiscal-week id range present in the data.
t_data['calendar.fiscal_week_id_for_year'].min(), t_data['calendar.fiscal_week_id_for_year'].max()
# Weekly totals per sales stream.
# BUG FIX: selecting multiple groupby columns with a bare tuple
# (df.groupby(...)['a', 'b', ...]) was deprecated and removed in pandas 2.0;
# the selection must be a list.
df_time_aggregate = t_data.groupby('calendar.fiscal_week_id_for_year')[
    ['daily_yoy_ndt.total_inside_sales', 'daily_yoy_ndt.total_food_service', 'diesel', 'unleaded']
].sum().reset_index()
# Sanity check: how many daily rows roll up into each fiscal week.
temp = t_data.groupby('calendar.fiscal_week_id_for_year')['daily_yoy_ndt.total_inside_sales'].size()
temp[temp > 1].sort_values(ascending=False)
df_time_aggregate = df_time_aggregate.set_index('calendar.fiscal_week_id_for_year')
df_time_aggregate.index
plt.figure(figsize=(40, 50));
df_time_aggregate.plot();
# Compare diesel vs. unleaded totals by type of day, one chart per year.
for year in (2021, 2022, 2023):
    yearly = t_data[t_data['calendar_year'] == year]
    pivot_table = yearly.pivot_table(
        index='calendar_information.type_of_day',
        values=['diesel', 'unleaded'],
        aggfunc='sum',
    )
    ax = pivot_table.plot(kind='bar', figsize=(10, 6))
    plt.title(f'Diesel vs. Unleaded Sales by Type of Day - {year}')
    plt.xlabel('Type of Day')
    plt.ylabel('Total Sales')
    # Thousands separators on the y axis for readability.
    ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda val, _pos: format(int(val), ',')))
    ax.set_xticklabels(pivot_table.index, rotation=0)
    plt.legend(['Diesel', 'Unleaded'])
    plt.show()
# Compare inside (merchandise) vs. food-service totals by type of day, per year.
for year in [2021, 2022, 2023]:
    sales_year = t_data[t_data['calendar_year'] == year]
    pivot_table = sales_year.pivot_table(index='calendar_information.type_of_day',
                                         values=['daily_yoy_ndt.total_inside_sales', 'daily_yoy_ndt.total_food_service'],
                                         aggfunc='sum')
    ax = pivot_table.plot(kind='bar', figsize=(10, 6))
    plt.title(f'Merchandise vs. Food Sales by Type of Day - {year}')
    plt.xlabel('Type of Day')
    plt.ylabel('Total Sales')
    ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, p: format(int(x), ',')))
    # Consistency fix: the parallel diesel/unleaded loop above sets tick
    # labels, adds a legend, and shows each figure — this loop stopped after
    # the axis formatter, so its charts were never rendered uniformly.
    ax.set_xticklabels(pivot_table.index, rotation=0)
    plt.legend(['Inside Sales', 'Food Service'])
    plt.show()
# 2021: total diesel vs. unleaded sales on each named holiday.
t_data_2021 = t_data[t_data['calendar_year'] == 2021]
sales_by_holiday_2021 = t_data_2021.groupby('calendar_information.holiday')[['diesel', 'unleaded']].sum()
# Keep only real holidays (drop the 'NONE' placeholder bucket).
sales_by_holiday_2021 = sales_by_holiday_2021[sales_by_holiday_2021.index != 'NONE']
holidays_2021 = sales_by_holiday_2021.index.tolist()
diesel_sales_2021 = sales_by_holiday_2021['diesel'].tolist()
unleaded_sales_2021 = sales_by_holiday_2021['unleaded'].tolist()
plt.figure(figsize=(15, 6))
bar_width = 0.35
# Side-by-side bars: the second series is shifted right by one bar width.
r1 = list(range(len(holidays_2021)))
r2 = [pos + bar_width for pos in r1]
plt.bar(r1, diesel_sales_2021, color='blue', width=bar_width, edgecolor='grey', label='Diesel')
plt.bar(r2, unleaded_sales_2021, color='orange', width=bar_width, edgecolor='grey', label='Unleaded')
plt.xlabel('Holiday')
plt.ylabel('Total Sales')
plt.title('Total Diesel and Unleaded Sales on Each Holiday in 2021')
# Center each tick between the paired bars.
plt.xticks([pos + bar_width / 2 for pos in r1], holidays_2021, rotation=90)
plt.legend()
plt.show()
# 2022: total diesel vs. unleaded sales on each named holiday.
t_data_2022 = t_data[t_data['calendar_year'] == 2022]
sales_by_holiday_2022 = t_data_2022.groupby('calendar_information.holiday')[['diesel', 'unleaded']].sum()
# Keep only real holidays (drop the 'NONE' placeholder bucket).
sales_by_holiday_2022 = sales_by_holiday_2022[sales_by_holiday_2022.index != 'NONE']
holidays_2022 = sales_by_holiday_2022.index.tolist()
diesel_sales_2022 = sales_by_holiday_2022['diesel'].tolist()
unleaded_sales_2022 = sales_by_holiday_2022['unleaded'].tolist()
plt.figure(figsize=(15, 6))
bar_width = 0.35
# Side-by-side bars: the second series is shifted right by one bar width.
r1 = list(range(len(holidays_2022)))
r2 = [pos + bar_width for pos in r1]
plt.bar(r1, diesel_sales_2022, color='blue', width=bar_width, edgecolor='grey', label='Diesel')
plt.bar(r2, unleaded_sales_2022, color='orange', width=bar_width, edgecolor='grey', label='Unleaded')
plt.xlabel('Holiday')
plt.ylabel('Total Sales')
plt.title('Total Diesel and Unleaded Sales on Each Holiday in 2022')
# Center each tick between the paired bars.
plt.xticks([pos + bar_width / 2 for pos in r1], holidays_2022, rotation=90)
plt.legend()
plt.show()
# 2023: total diesel vs. unleaded sales on each named holiday.
t_data_2023 = t_data[t_data['calendar_year'] == 2023]
sales_by_holiday_2023 = t_data_2023.groupby('calendar_information.holiday')[['diesel', 'unleaded']].sum()
# Keep only real holidays (drop the 'NONE' placeholder bucket).
sales_by_holiday_2023 = sales_by_holiday_2023[sales_by_holiday_2023.index != 'NONE']
holidays_2023 = sales_by_holiday_2023.index.tolist()
diesel_sales_2023 = sales_by_holiday_2023['diesel'].tolist()
unleaded_sales_2023 = sales_by_holiday_2023['unleaded'].tolist()
plt.figure(figsize=(15, 6))
bar_width = 0.35
# Side-by-side bars: the second series is shifted right by one bar width.
r1 = list(range(len(holidays_2023)))
r2 = [pos + bar_width for pos in r1]
plt.bar(r1, diesel_sales_2023, color='blue', width=bar_width, edgecolor='grey', label='Diesel')
plt.bar(r2, unleaded_sales_2023, color='orange', width=bar_width, edgecolor='grey', label='Unleaded')
plt.xlabel('Holiday')
plt.ylabel('Total Sales')
plt.title('Total Diesel and Unleaded Sales on Each Holiday in 2023')
# Center each tick between the paired bars.
plt.xticks([pos + bar_width / 2 for pos in r1], holidays_2023, rotation=90)
plt.legend()
plt.show()
# 2021: total in-store vs. food-service sales on each named holiday.
t_data_2021 = t_data[t_data['calendar_year'] == 2021]
sales_by_holiday_2021 = t_data_2021.groupby('calendar_information.holiday')[['daily_yoy_ndt.total_inside_sales', 'daily_yoy_ndt.total_food_service']].sum()
# Drop the 'NONE' placeholder bucket so only real holidays remain.
sales_by_holiday_2021 = sales_by_holiday_2021[sales_by_holiday_2021.index != 'NONE']
holidays_2021 = sales_by_holiday_2021.index.tolist()
inStore_sales_2021 = sales_by_holiday_2021['daily_yoy_ndt.total_inside_sales'].tolist()
Food_sales_2021 = sales_by_holiday_2021['daily_yoy_ndt.total_food_service'].tolist()
plt.figure(figsize=(15, 6))
bar_width = 0.35
# Paired bars: food bars sit one bar width right of in-store bars.
r1 = list(range(len(holidays_2021)))
r2 = [pos + bar_width for pos in r1]
plt.bar(r1, inStore_sales_2021, color='blue', width=bar_width, edgecolor='grey', label='InStore')
plt.bar(r2, Food_sales_2021, color='orange', width=bar_width, edgecolor='grey', label='Food')
plt.xlabel('Holiday')
plt.ylabel('Total Sales')
plt.title('Total InStore and Food Sales on Each Holiday in 2021')
plt.xticks([pos + bar_width / 2 for pos in r1], holidays_2021, rotation=90)
plt.legend()
plt.show()
# 2022: total in-store vs. food-service sales on each named holiday.
t_data_2022 = t_data[t_data['calendar_year'] == 2022]
sales_by_holiday_2022 = t_data_2022.groupby('calendar_information.holiday')[['daily_yoy_ndt.total_inside_sales', 'daily_yoy_ndt.total_food_service']].sum()
# Drop the 'NONE' placeholder bucket so only real holidays remain.
sales_by_holiday_2022 = sales_by_holiday_2022[sales_by_holiday_2022.index != 'NONE']
holidays_2022 = sales_by_holiday_2022.index.tolist()
inStore_sales_2022 = sales_by_holiday_2022['daily_yoy_ndt.total_inside_sales'].tolist()
Food_sales_2022 = sales_by_holiday_2022['daily_yoy_ndt.total_food_service'].tolist()
plt.figure(figsize=(15, 6))
bar_width = 0.35
# Paired bars: food bars sit one bar width right of in-store bars.
r1 = list(range(len(holidays_2022)))
r2 = [pos + bar_width for pos in r1]
plt.bar(r1, inStore_sales_2022, color='blue', width=bar_width, edgecolor='grey', label='InStore')
plt.bar(r2, Food_sales_2022, color='orange', width=bar_width, edgecolor='grey', label='Food')
plt.xlabel('Holiday')
plt.ylabel('Total Sales')
plt.title('Total InStore and Food Sales on Each Holiday in 2022')
plt.xticks([pos + bar_width / 2 for pos in r1], holidays_2022, rotation=90)
plt.legend()
plt.show()
# 2023: total in-store vs. food-service sales on each named holiday.
t_data_2023 = t_data[t_data['calendar_year'] == 2023]
sales_by_holiday_2023 = t_data_2023.groupby('calendar_information.holiday')[['daily_yoy_ndt.total_inside_sales', 'daily_yoy_ndt.total_food_service']].sum()
# Drop the 'NONE' placeholder bucket so only real holidays remain.
sales_by_holiday_2023 = sales_by_holiday_2023[sales_by_holiday_2023.index != 'NONE']
holidays_2023 = sales_by_holiday_2023.index.tolist()
inStore_sales_2023 = sales_by_holiday_2023['daily_yoy_ndt.total_inside_sales'].tolist()
Food_sales_2023 = sales_by_holiday_2023['daily_yoy_ndt.total_food_service'].tolist()
plt.figure(figsize=(15, 6))
bar_width = 0.35
r1 = range(len(holidays_2023))
r2 = [x + bar_width for x in r1]
plt.bar(r1, inStore_sales_2023, color='blue', width=bar_width, edgecolor='grey', label='InStore')
plt.bar(r2, Food_sales_2023, color='orange', width=bar_width, edgecolor='grey', label='Food')
plt.xlabel('Holiday')
plt.ylabel('Total Sales')
# BUG FIX: this chart plots 2023 data but its title said 2021.
plt.title('Total InStore and Food Sales on Each Holiday in 2023')
plt.xticks([r + bar_width/2 for r in r1], holidays_2023, rotation=90)
plt.legend()
plt.show()
# Monthly average inside sales, one line per year (2023 only runs Jan-Aug).
average_sales_2021 = t_data_2021.groupby('calendar_month')['daily_yoy_ndt.total_inside_sales'].mean()
average_sales_2022 = t_data_2022.groupby('calendar_month')['daily_yoy_ndt.total_inside_sales'].mean()
average_sales_2023 = t_data_2023.groupby('calendar_month')['daily_yoy_ndt.total_inside_sales'].mean()
months = range(1, 13)
months_2023 = range(1, 9)
plt.figure(figsize=(10, 6))
for month_axis, yearly_avg, year_label in (
    (months, average_sales_2021, '2021'),
    (months, average_sales_2022, '2022'),
    (months_2023, average_sales_2023, '2023'),
):
    plt.plot(month_axis, yearly_avg, label=year_label)
plt.xlabel('Month')
plt.ylabel('Average Sales')
plt.title('Average Inside Sales per Month (2021-2023)')
plt.legend()
plt.show()
# Monthly average food-service sales per year (reuses `months` defined above).
average_sales_2021 = t_data_2021.groupby('calendar_month')['daily_yoy_ndt.total_food_service'].mean()
average_sales_2022 = t_data_2022.groupby('calendar_month')['daily_yoy_ndt.total_food_service'].mean()
average_sales_2023 = t_data_2023.groupby('calendar_month')['daily_yoy_ndt.total_food_service'].mean()
months_2023 = range(1, 9)
plt.figure(figsize=(10, 6))
for month_axis, yearly_avg, year_label in (
    (months, average_sales_2021, '2021'),
    (months, average_sales_2022, '2022'),
    (months_2023, average_sales_2023, '2023'),
):
    plt.plot(month_axis, yearly_avg, label=year_label)
plt.xlabel('Month')
plt.ylabel('Average Sales')
plt.title('Average Food Sales per Month (2021-2023)')
plt.legend()
plt.show()
# Monthly average diesel sales per year (reuses `months` defined above).
average_sales_2021 = t_data_2021.groupby('calendar_month')['diesel'].mean()
average_sales_2022 = t_data_2022.groupby('calendar_month')['diesel'].mean()
average_sales_2023 = t_data_2023.groupby('calendar_month')['diesel'].mean()
months_2023 = range(1, 9)
plt.figure(figsize=(10, 6))
for month_axis, yearly_avg, year_label in (
    (months, average_sales_2021, '2021'),
    (months, average_sales_2022, '2022'),
    (months_2023, average_sales_2023, '2023'),
):
    plt.plot(month_axis, yearly_avg, label=year_label)
plt.xlabel('Month')
plt.ylabel('Average Sales')
plt.title('Average Diesel Sales per Month (2021-2023)')
plt.legend()
plt.show()
# Monthly average unleaded sales per year (reuses `months` defined above).
average_sales_2021 = t_data_2021.groupby('calendar_month')['unleaded'].mean()
average_sales_2022 = t_data_2022.groupby('calendar_month')['unleaded'].mean()
average_sales_2023 = t_data_2023.groupby('calendar_month')['unleaded'].mean()
months_2023 = range(1, 9)
plt.figure(figsize=(10, 6))
for month_axis, yearly_avg, year_label in (
    (months, average_sales_2021, '2021'),
    (months, average_sales_2022, '2022'),
    (months_2023, average_sales_2023, '2023'),
):
    plt.plot(month_axis, yearly_avg, label=year_label)
plt.xlabel('Month')
plt.ylabel('Average Sales')
plt.title('Average unleaded Sales per Month (2021-2023)')
plt.legend()
plt.show()
def get_season(month):
    """Map a calendar month number (1-12) to its season name.

    Months 9-11 — and any value not listed — fall through to 'Fall',
    matching the original if/elif chain's else branch.
    """
    season_of = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    return season_of.get(month, 'Fall')
# Seasonal split of inside sales for each year. 2023 only has data through
# August, hence its shorter colour palette (no Fall slice).
t_data['season'] = t_data['calendar_month'].apply(get_season)
t_data_2021 = t_data[t_data['calendar_year'] == 2021].copy()
t_data_2022 = t_data[t_data['calendar_year'] == 2022].copy()
t_data_2023 = t_data[t_data['calendar_year'] == 2023].copy()
sales_by_season_2021 = t_data_2021.groupby('season')[['daily_yoy_ndt.total_inside_sales']].sum()
sales_by_season_2022 = t_data_2022.groupby('season')[['daily_yoy_ndt.total_inside_sales']].sum()
sales_by_season_2023 = t_data_2023.groupby('season')[['daily_yoy_ndt.total_inside_sales']].sum()
plt.figure(figsize=(18, 5))
full_palette = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
pie_specs = (
    (sales_by_season_2021, '2021', full_palette),
    (sales_by_season_2022, '2022', full_palette),
    (sales_by_season_2023, '2023', full_palette[1:]),
)
for slot, (season_sales, year, palette) in enumerate(pie_specs, start=1):
    plt.subplot(1, 3, slot)
    plt.pie(season_sales['daily_yoy_ndt.total_inside_sales'], labels=season_sales.index,
            autopct='%1.1f%%', startangle=140, colors=palette)
    plt.title(f'Distribution of Inside Sales by Season in {year}')
plt.tight_layout()
# Companion tables with formatted totals.
sales_table_2021 = sales_by_season_2021.reset_index()
sales_table_2021.columns = ['Season', 'Total Sales (2021)']
sales_table_2022 = sales_by_season_2022.reset_index()
sales_table_2022.columns = ['Season', 'Total Sales (2022)']
sales_table_2023 = sales_by_season_2023.reset_index()
sales_table_2023.columns = ['Season', 'Total Sales (2023)']
with pd.option_context('display.float_format', '{:,.2f}'.format):
    print("Inside Sales for 2021:")
    display(sales_table_2021)
    print("\n Inside Sales for 2022:")
    display(sales_table_2022)
    print("\n Inside Sales for 2023:")
    display(sales_table_2023)
# Seasonal split of food-service sales for each year. 2023 only has data
# through August, hence its shorter colour palette (no Fall slice).
t_data['season'] = t_data['calendar_month'].apply(get_season)
t_data_2021 = t_data[t_data['calendar_year'] == 2021].copy()
t_data_2022 = t_data[t_data['calendar_year'] == 2022].copy()
t_data_2023 = t_data[t_data['calendar_year'] == 2023].copy()
sales_by_season_2021 = t_data_2021.groupby('season')[['daily_yoy_ndt.total_food_service']].sum()
sales_by_season_2022 = t_data_2022.groupby('season')[['daily_yoy_ndt.total_food_service']].sum()
sales_by_season_2023 = t_data_2023.groupby('season')[['daily_yoy_ndt.total_food_service']].sum()
plt.figure(figsize=(18, 5))
full_palette = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
pie_specs = (
    (sales_by_season_2021, '2021', full_palette),
    (sales_by_season_2022, '2022', full_palette),
    (sales_by_season_2023, '2023', full_palette[1:]),
)
for slot, (season_sales, year, palette) in enumerate(pie_specs, start=1):
    plt.subplot(1, 3, slot)
    plt.pie(season_sales['daily_yoy_ndt.total_food_service'], labels=season_sales.index,
            autopct='%1.1f%%', startangle=140, colors=palette)
    plt.title(f'Distribution of Food Sales by Season in {year}')
plt.tight_layout()
# Companion tables with formatted totals.
sales_table_2021 = sales_by_season_2021.reset_index()
sales_table_2021.columns = ['Season', 'Total Sales (2021)']
sales_table_2022 = sales_by_season_2022.reset_index()
sales_table_2022.columns = ['Season', 'Total Sales (2022)']
sales_table_2023 = sales_by_season_2023.reset_index()
sales_table_2023.columns = ['Season', 'Total Sales (2023)']
with pd.option_context('display.float_format', '{:,.2f}'.format):
    print("Food Sales for 2021:")
    display(sales_table_2021)
    print("\n Food Sales for 2022:")
    display(sales_table_2022)
    print("\n Food Sales for 2023:")
    display(sales_table_2023)
def get_season(month):
    """Return the season name for a month number (1-12).

    NOTE: this re-defines the identical get_season declared earlier in the
    file; behavior is unchanged. Months 9-11 (and any unlisted value)
    return 'Fall'.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    return 'Fall'
# Seasonal split of diesel sales per year; the 2023 palette is shorter
# because the year has no Fall data yet.
t_data['season'] = t_data['calendar_month'].apply(get_season)
t_data_2021 = t_data[t_data['calendar_year'] == 2021].copy()
t_data_2022 = t_data[t_data['calendar_year'] == 2022].copy()
t_data_2023 = t_data[t_data['calendar_year'] == 2023].copy()
sales_by_season_2021 = t_data_2021.groupby('season')[['diesel']].sum()
sales_by_season_2022 = t_data_2022.groupby('season')[['diesel']].sum()
sales_by_season_2023 = t_data_2023.groupby('season')[['diesel']].sum()
plt.figure(figsize=(18, 5))
full_palette = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
pie_specs = (
    (sales_by_season_2021, '2021', full_palette),
    (sales_by_season_2022, '2022', full_palette),
    (sales_by_season_2023, '2023', full_palette[1:]),
)
for slot, (season_sales, year, palette) in enumerate(pie_specs, start=1):
    plt.subplot(1, 3, slot)
    plt.pie(season_sales['diesel'], labels=season_sales.index,
            autopct='%1.1f%%', startangle=140, colors=palette)
    plt.title(f'Distribution of Sales by Season in {year}')
plt.tight_layout()
# Companion tables with formatted totals.
sales_table_2021 = sales_by_season_2021.reset_index()
sales_table_2021.columns = ['Season', 'Total Sales (2021)']
sales_table_2022 = sales_by_season_2022.reset_index()
sales_table_2022.columns = ['Season', 'Total Sales (2022)']
sales_table_2023 = sales_by_season_2023.reset_index()
sales_table_2023.columns = ['Season', 'Total Sales (2023)']
with pd.option_context('display.float_format', '{:,.2f}'.format):
    print("Sales for 2021:")
    display(sales_table_2021)
    print("\nSales for 2022:")
    display(sales_table_2022)
    print("\nSales for 2023:")
    display(sales_table_2023)
# Seasonal split of unleaded sales per year; the 2023 palette is shorter
# because the year has no Fall data yet.
t_data['season'] = t_data['calendar_month'].apply(get_season)
t_data_2021 = t_data[t_data['calendar_year'] == 2021].copy()
t_data_2022 = t_data[t_data['calendar_year'] == 2022].copy()
t_data_2023 = t_data[t_data['calendar_year'] == 2023].copy()
sales_by_season_2021 = t_data_2021.groupby('season')[['unleaded']].sum()
sales_by_season_2022 = t_data_2022.groupby('season')[['unleaded']].sum()
sales_by_season_2023 = t_data_2023.groupby('season')[['unleaded']].sum()
plt.figure(figsize=(18, 5))
full_palette = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
pie_specs = (
    (sales_by_season_2021, '2021', full_palette),
    (sales_by_season_2022, '2022', full_palette),
    (sales_by_season_2023, '2023', full_palette[1:]),
)
for slot, (season_sales, year, palette) in enumerate(pie_specs, start=1):
    plt.subplot(1, 3, slot)
    plt.pie(season_sales['unleaded'], labels=season_sales.index,
            autopct='%1.1f%%', startangle=140, colors=palette)
    plt.title(f'Distribution of Sales by Season in {year}')
plt.tight_layout()
# Companion tables with formatted totals.
sales_table_2021 = sales_by_season_2021.reset_index()
sales_table_2021.columns = ['Season', 'Total Sales (2021)']
sales_table_2022 = sales_by_season_2022.reset_index()
sales_table_2022.columns = ['Season', 'Total Sales (2022)']
sales_table_2023 = sales_by_season_2023.reset_index()
sales_table_2023.columns = ['Season', 'Total Sales (2023)']
with pd.option_context('display.float_format', '{:,.2f}'.format):
    print("Sales for 2021:")
    display(sales_table_2021)
    print("\nSales for 2022:")
    display(sales_table_2022)
    print("\nSales for 2023:")
    display(sales_table_2023)
# Per-site lifetime totals for each revenue stream, plotted side by side.
# (reset_index + set_index on the same key reproduces the groupby index.)
grouped_data = (
    t_data.groupby('site_id_msba')
    .agg({
        'daily_yoy_ndt.total_inside_sales': 'sum',
        'daily_yoy_ndt.total_food_service': 'sum',
        'diesel': 'sum',
        'unleaded': 'sum',
    })
    .reset_index()
    .set_index('site_id_msba')
)
ax = grouped_data.plot(kind='bar', figsize=(12, 8), width=0.8)
plt.xlabel('site_id_msba')
plt.ylabel('Total Sum')
plt.title('Total Sum of Sales and Fuel Types by site_id_msba')
plt.legend(title='Categories')
plt.tight_layout()
plt.show()
# Attach the site attributes to every daily row, then plot population vs.
# median income for each of the four catchment definitions.
df_merged = pd.merge(t_data, q_data, on='site_id_msba', how='left')
df_merged.shape
df_merged.head()
scatter_specs = [
    ('x1_mile_pop', 'x1_mile_income', '1-mile'),
    ('x1_2_mile_pop', 'x1_2_mile_income', '1/2-mile'),
    ('x5_min_pop', 'x5_min_inc', '5-minute'),
    ('x7_min_pop', 'x7_min_inc', '7-minute'),
]
for pop_col, inc_col, span in scatter_specs:
    plt.scatter(df_merged[pop_col], df_merged[inc_col])
    plt.xlabel(f'Population within {span} radius')
    plt.ylabel(f'Median income of {span} radius population')
    plt.title(f'Scatter Plot: {span} Population vs. {span} Income')
    plt.show()
# Total inside+food sales attributed to each catchment measure.
# NOTE(review): the four `pop > 0` filters very likely overlap (most rows
# have positive population in every radius), so the slices do not partition
# total sales — confirm the intended comparison.
df_merged['total_sales'] = df_merged['daily_yoy_ndt.total_food_service'] + df_merged['daily_yoy_ndt.total_inside_sales']
radius_sales = {
    '1 Mile': df_merged.loc[df_merged['x1_mile_pop'] > 0, 'total_sales'].sum(),
    '2 Miles': df_merged.loc[df_merged['x1_2_mile_pop'] > 0, 'total_sales'].sum(),
    '5 Minutes': df_merged.loc[df_merged['x5_min_pop'] > 0, 'total_sales'].sum(),
    '7 Minutes': df_merged.loc[df_merged['x7_min_pop'] > 0, 'total_sales'].sum(),
}
labels = list(radius_sales)
values = list(radius_sales.values())
plt.figure(figsize=(8, 8))
plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title('Contribution of Total Sales by Population Radius')
plt.show()
# Bin each income measure into labeled ranges and plot total sales per bin.
income_bins = [20000, 30000, 50000, 70000, 90000, 110000, 1000000]
# BUG FIX: the final bin covers 110k-1M but was labeled '100k+'.
income_labels = ['20k-30k', '30k-50k', '50k-70k', '70k-90k', '90k-110k', '110k+']
df_merged['x1_mile_income_bin'] = pd.cut(df_merged['x1_mile_income'], bins=income_bins, labels=income_labels, right=False)
df_merged['x1_2_mile_income_bin'] = pd.cut(df_merged['x1_2_mile_income'], bins=income_bins, labels=income_labels, right=False)
df_merged['x5_min_inc_bin'] = pd.cut(df_merged['x5_min_inc'], bins=income_bins, labels=income_labels, right=False)
df_merged['x7_min_inc_bin'] = pd.cut(df_merged['x7_min_inc'], bins=income_bins, labels=income_labels, right=False)
df_merged['total_sales'] = df_merged['daily_yoy_ndt.total_inside_sales'] + df_merged['daily_yoy_ndt.total_food_service']
total_sales_by_income_x1_mile = df_merged.groupby('x1_mile_income_bin')['total_sales'].sum()
total_sales_by_income_x1_2_mile = df_merged.groupby('x1_2_mile_income_bin')['total_sales'].sum()
total_sales_by_income_x5_min = df_merged.groupby('x5_min_inc_bin')['total_sales'].sum()
total_sales_by_income_x7_min = df_merged.groupby('x7_min_inc_bin')['total_sales'].sum()
# One subplot per income measure (2x2 grid).
plt.figure(figsize=(12, 8))
for pos, (series, bin_name) in enumerate([
        (total_sales_by_income_x1_mile, 'x1_mile_income_bin'),
        (total_sales_by_income_x1_2_mile, 'x1_2_mile_income_bin'),
        (total_sales_by_income_x5_min, 'x5_min_inc_bin'),
        (total_sales_by_income_x7_min, 'x7_min_inc_bin')], start=221):
    plt.subplot(pos)
    series.plot(kind='bar')
    plt.title(f'Total Sales by {bin_name}')
    plt.xlabel('Income Range')
    plt.ylabel('Total Sales')
plt.tight_layout()
plt.show()
# Split sites into those selling at least one branded food item and those
# selling none, then show the share of each group.
sells_items = df_merged[
    (df_merged['freal'] > 0)
    | (df_merged['bonfire_grill'] > 0)
    | (df_merged['pizza'] > 0)
    | (df_merged['cinnabon'] > 0)
]
does_not_sell_items = df_merged[
    (df_merged['freal'] == 0)
    & (df_merged['bonfire_grill'] == 0)
    & (df_merged['pizza'] == 0)
    & (df_merged['cinnabon'] == 0)
]
# Mean food-service sales per group (computed for reference; only the group
# sizes feed the pie below).
avg_food_service_sells_items = sells_items['daily_yoy_ndt.total_food_service'].mean()
avg_food_service_does_not_sell_items = does_not_sell_items['daily_yoy_ndt.total_food_service'].mean()
labels = ['Sells Items', 'Does Not Sell Items']
store_type_counts = [len(sells_items), len(does_not_sell_items)]
plt.figure(figsize=(8, 6))
plt.pie(store_type_counts, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Store Types')
plt.show()
# Fuel sales broken down by each forecourt-related site attribute.
columns_to_visualize = ['cat_scales', 'propane', 'traditional_forecourt_fueling_positions',
                        'traditional_forecourt_layout', 'traditional_forecourt_stack_type',
                        'hi_flow_rv_lanes_layout', 'hi_flow_rv_lanes_stack_type']
for column in columns_to_visualize:
    fuel_totals = df_merged.groupby(column)[['diesel', 'unleaded']].sum()
    fuel_totals.plot(kind='bar', figsize=(10, 6))
    plt.title(f'Sales of Diesel and Unleaded by {column}')
    plt.xlabel(column)
    plt.ylabel('Total Sales')
    plt.xticks(rotation=0)
    plt.legend(['Diesel', 'Unleaded'])
    plt.show()
# TODO(Shivi): add commentary interpreting the correlation heatmaps below.
# Correlation between hi-flow/DEF site attributes and diesel volume.
numerical_columns = ['hi_flow_lanes', 'hi_flow_rv_lanes', 'def',
                     'hi_flow_lanes_fueling_positions', 'hi_flow_lanes_layout',
                     'hi_flow_lanes_stack_type', 'diesel']
subset_df = df_merged[numerical_columns]
correlation_matrix = subset_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
# Correlation between RV-lane site attributes and unleaded volume.
numerical_columns = ['ethanol_free', 'rv_lanes', 'rv_dumps',
                     'rv_lanes_fueling_positions', 'rv_lanes_layout',
                     'rv_lanes_stack_type', 'unleaded']
subset_df = df_merged[numerical_columns]
correlation_matrix = subset_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
# Drop the helper bin columns created for the income analysis.
columns_to_drop = ['x1_mile_income_bin', 'x1_2_mile_income_bin', 'x5_min_inc_bin', 'x7_min_inc_bin']
df_merged = df_merged.drop(columns=columns_to_drop)
# Re-attach the daily date from t_data's index and make it this frame's index.
# NOTE(review): this assumes df_merged still has exactly one row per t_data
# row, in the same order, after the left merge — i.e. q_data has a single
# row per site_id_msba. Confirm; a duplicate site row would break alignment.
df_merged['calendar.calendar_day_date'] = t_data.index
df_merged.set_index('calendar.calendar_day_date', inplace=True)
# Identifying categorical columns
categorical_columns = df_merged.select_dtypes(include=['object']).columns
# Applying one-hot encoding to categorical columns
encoded_df = pd.get_dummies(df_merged, columns=categorical_columns)
encoded_df.head()
# Plot food-service sales over time against their overall mean.
average_sales = encoded_df['daily_yoy_ndt.total_food_service'].mean()
encoded_df['daily_yoy_ndt.total_food_service'].plot(style=".", figsize=(15, 5), title="Food Sale over the Years")
plt.axhline(y=average_sales, color='r', linestyle='-', label='Average Sales')
plt.legend()
plt.show()
# Chronological split: everything before 2023 trains, 2023 onward tests.
cutoff = '2023-01-01'
train = encoded_df['daily_yoy_ndt.total_food_service'].loc[encoded_df.index < cutoff]
test = encoded_df['daily_yoy_ndt.total_food_service'].loc[encoded_df.index >= cutoff]
fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='training set')
test.plot(ax=ax, label='Test set')
ax.axvline(cutoff, color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
# Features (X) and Target (y) for the first XGBoost fit.
X = encoded_df.drop(columns=['daily_yoy_ndt.total_food_service'])
y = encoded_df['daily_yoy_ndt.total_food_service']
train_end_date = '2023-01-01'
X_train = X[X.index < train_end_date]
X_test = X[X.index >= train_end_date]
y_train = y[y.index < train_end_date]
y_test = y[y.index >= train_end_date]
# BUG FIX: objective 'reg:linear' was deprecated in XGBoost 0.9 and later
# removed; 'reg:squarederror' is the identically-defined replacement.
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                       n_estimators=1000,
                       early_stopping_rounds=50,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)
# Rank features by importance and chart the top 10.
fi = pd.DataFrame(data=reg.feature_importances_,
                  index=reg.feature_names_in_,
                  columns=['importance'])
top_10_features = fi.sort_values(by='importance', ascending=False).head(10)
top_10_features.plot(kind='barh', title='Top 10 Feature Importance')
# Restrict to the top predictors and visualize the chronological split.
selected_cols = ['daily_yoy_ndt.total_food_service', 'x1_2_mile_pop', 'x7_min_pop',
                 'x1_mile_pop', 'womens_sink_count', 'hi_flow_lanes', 'square_feet',
                 'x1_2_mile_emp', 'x5_min_pop']
train = encoded_df[selected_cols].loc[encoded_df.index < '2023-01-01']
test = encoded_df[selected_cols].loc[encoded_df.index >= '2023-01-01']
fig, ax = plt.subplots(figsize=(15, 5))
train['daily_yoy_ndt.total_food_service'].plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test['daily_yoy_ndt.total_food_service'].plot(ax=ax, label='Test Set')
ax.axvline('2023-01-01', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
# Visualize the five expanding-window folds produced by TimeSeriesSplit.
tss = TimeSeriesSplit(n_splits=5)
df = encoded_df[['daily_yoy_ndt.total_food_service', 'x1_2_mile_pop', 'x7_min_pop',
                 'x1_mile_pop', 'womens_sink_count', 'hi_flow_lanes', 'square_feet',
                 'x1_2_mile_emp', 'x5_min_pop']].sort_index()
fig, axs = plt.subplots(5, 1, figsize=(15, 15), sharex=True)
for fold, (train_idx, val_idx) in enumerate(tss.split(df)):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    train['daily_yoy_ndt.total_food_service'].plot(ax=axs[fold], label='Training Set',
                                                   title=f'Data Train/Test Split Fold {fold}')
    test['daily_yoy_ndt.total_food_service'].plot(ax=axs[fold], label='Test Set')
    # Mark where the validation window begins.
    axs[fold].axvline(test.index.min(), color='black', ls='--')
plt.show()
def create_features(df):
    """Return a copy of *df* with calendar features derived from its DatetimeIndex.

    Adds dayofweek, quarter, month, year, dayofyear and dayofmonth columns;
    the input frame is left untouched.
    """
    out = df.copy()
    idx = out.index
    out['dayofweek'] = idx.dayofweek
    out['quarter'] = idx.quarter
    out['month'] = idx.month
    out['year'] = idx.year
    out['dayofyear'] = idx.dayofyear
    out['dayofmonth'] = idx.day
    return out
# Augment the selected-feature frame with the calendar features.
df = create_features(df)
df.head()
def add_lags(df):
    """Attach 'lag1': food-service sales observed exactly 364 days earlier
    (364 = 52 weeks, so the lag lands on the same weekday).

    Mutates *df* in place and returns it; dates with no observation
    exactly 364 days back get NaN.
    """
    history = df['daily_yoy_ndt.total_food_service'].to_dict()
    df['lag1'] = (df.index - pd.Timedelta('364 days')).map(history)
    return df
# Add the 364-day lag of food-service sales as 'lag1'.
df = add_lags(df)
df.tail()
# Walk-forward cross-validation for daily food-service sales.
tss = TimeSeriesSplit(n_splits=5)
df = df.sort_index()
preds = []
scores = []
TARGET = 'daily_yoy_ndt.total_food_service'
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    # BUG FIX: previously X_train/X_test were the whole frame INCLUDING the
    # target, so the model could "predict" sales from sales itself. Drop it.
    X_train = train.drop(columns=[TARGET])
    y_train = train[TARGET]
    X_test = test.drop(columns=[TARGET])
    y_test = test[TARGET]
    # 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)
    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)
# MAE/MSE/R2 below describe the LAST fold only; the mean RMSE covers all folds.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R-squared (R2) Score: {r2:.2f}')
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
# Refit on ALL history (no hold-out) before forecasting the future horizon.
# (The dead future/future_df creation that used to sit here was removed; the
# following cell recreates both before they are used.)
TARGET = 'daily_yoy_ndt.total_food_service'
df['isFuture'] = False  # flag column; it is one of the model inputs below
# Define features (excluding the target)
FEATURES = ['x1_2_mile_pop', 'x7_min_pop', 'x1_mile_pop', 'womens_sink_count',
            'hi_flow_lanes', 'square_feet', 'x1_2_mile_emp', 'x5_min_pop',
            'dayofweek', 'quarter', 'month', 'year', 'dayofyear',
            'dayofmonth', 'lag1', 'isFuture']
# NOTE(review): 'isFuture' is constant (False) in training but True for every
# prediction row, so the model cannot have learned a useful split on it —
# candidate for removal together with the matching prediction cell.
X = df[FEATURES]
y = df[TARGET]
# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
reg = xgb.XGBRegressor(base_score=0.5,
                       booster='gbtree',
                       n_estimators=500,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X, y, eval_set=[(X, y)], verbose=100)
# Build a daily index for the forecast horizon and mark those rows as future.
future = pd.date_range('2023-08-17','2024-12-31', freq='D')
future_df = pd.DataFrame(index=future)
future_df['isFuture'] = True
df['isFuture'] = False
# Stack history + horizon so calendar and lag features are computed together.
df_and_future = pd.concat([df, future_df])
df_and_future = create_features(df_and_future)
df_and_future = add_lags(df_and_future)
# Keep only the horizon rows for scoring.
future_w_features = df_and_future.query('isFuture').copy()
future_w_features.head()
future_w_features = future_w_features.drop('daily_yoy_ndt.total_food_service', axis=1)
future_w_features['pred'] = reg.predict(future_w_features)
future_w_features.index = pd.to_datetime(future_w_features.index)  # ensure datetime index
plt.figure(figsize=(10, 5))
plt.plot(future_w_features.index, future_w_features['pred'], color='r', marker='o', linestyle='-', linewidth=1, markersize=3)
plt.title('Future Predictions For Food Sales ')
plt.xlabel('Date')
# BUG FIX: this is the food-service forecast; the axis label said 'Inside Sales'.
plt.ylabel('Food Sales')
plt.grid(True)
plt.show()
# Plot history and forecast on one axis, then tabulate the forecast.
plt.figure(figsize=(15, 5))
# Plot the entire historical data
encoded_df['daily_yoy_ndt.total_food_service'].plot(label='Historical Data', color='b')
# Guide line at the start of the forecast horizon.
plt.axvline('2023-08-17', color='black', ls='--')
# Plot future predictions
plt.plot(future_w_features.index, future_w_features['pred'], label='Future Predictions', color='red', linestyle='dashed')
# Set title and labels
plt.title('Historical Data and Future Predictions of Total Food Sales')
plt.xlabel('Date')
plt.ylabel('Value')
# Add legend
plt.legend()
# Show plot
plt.show()
# 503 daily rows: 2023-08-17 + 502 days lands on 2024-12-31, matching 'pred'.
start_date = '2023-08-17'
end_date = pd.to_datetime(start_date) + pd.DateOffset(days=502)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
# Create a DataFrame for the forecasted sales
forecasted_sales_df = pd.DataFrame({'Date': date_range, 'Sales': future_w_features['pred']})
# Display the DataFrame
print(forecasted_sales_df)
# Daily inside sales as dots, with the all-time mean drawn for reference.
average_sales = encoded_df['daily_yoy_ndt.total_inside_sales'].mean()
# BUG FIX: title said "Diesel Sale over the Years" (copy-paste from the diesel cell).
encoded_df['daily_yoy_ndt.total_inside_sales'].plot(style=".", figsize=(15,5), title="Inside Sales over the Years")
plt.axhline(y=average_sales, color='r', linestyle='-', label='Average Sales')
plt.legend()
plt.show()
# Inside sales: observations before 2023 train, 2023 onward test.
train = encoded_df.loc[encoded_df.index < '2023-01-01', 'daily_yoy_ndt.total_inside_sales']
test = encoded_df.loc[encoded_df.index >= '2023-01-01', 'daily_yoy_ndt.total_inside_sales']
fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='training set')
test.plot(ax=ax, label='Test set')
ax.axvline('2023-01-01', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
# Train/test split at 2023-01-01 for inside sales, then fit a gradient-boosted
# regressor on every remaining column.
X = encoded_df.drop(columns=['daily_yoy_ndt.total_inside_sales'])
y = encoded_df['daily_yoy_ndt.total_inside_sales']
# NOTE(review): X still contains the other sales series (diesel, unleaded,
# food service), which would not be known when forecasting — confirm intended.
train_end_date = '2023-01-01'
X_train = X[X.index < train_end_date]
X_test = X[X.index >= train_end_date]
y_train = y[y.index < train_end_date]
y_test = y[y.index >= train_end_date]
# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                       n_estimators=1000,
                       early_stopping_rounds=50,  # stop when test RMSE stalls
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)
# Importances from the inside-sales model, then the 2023-01-01 split drawn
# over the eight strongest predictors.
fi = pd.DataFrame({'importance': reg.feature_importances_},
                  index=reg.feature_names_in_)
top_10_features = fi.sort_values('importance', ascending=False).head(10)
top_10_features.plot(kind='barh', title='Top 10 Feature Importance')
inside_cols = ['daily_yoy_ndt.total_inside_sales', 'x5_min_pop', 'x1_2_mile_emp',
               'square_feet', 'hi_flow_lanes', 'womens_sink_count',
               'x1_mile_pop', 'x7_min_pop', 'x1_2_mile_pop']
train = encoded_df.loc[encoded_df.index < '2023-01-01', inside_cols]
test = encoded_df.loc[encoded_df.index >= '2023-01-01', inside_cols]
fig, ax = plt.subplots(figsize=(15, 5))
train['daily_yoy_ndt.total_inside_sales'].plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test['daily_yoy_ndt.total_inside_sales'].plot(ax=ax, label='Test Set')
ax.axvline('2023-01-01', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
# Visualize the five expanding walk-forward folds for inside sales.
tss = TimeSeriesSplit(n_splits=5)
df = encoded_df[['daily_yoy_ndt.total_inside_sales','x5_min_pop','x1_2_mile_emp','square_feet','hi_flow_lanes','womens_sink_count','x1_mile_pop','x7_min_pop','x1_2_mile_pop']].sort_index()
fig, axs = plt.subplots(5, 1, figsize=(15, 15), sharex=True)
for fold, (train_idx, val_idx) in enumerate(tss.split(df)):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    train['daily_yoy_ndt.total_inside_sales'].plot(ax=axs[fold],
                                                   label='Training Set',
                                                   title=f'Data Train/Test Split Fold {fold}')
    test['daily_yoy_ndt.total_inside_sales'].plot(ax=axs[fold],
                                                  label='Test Set')
    axs[fold].axvline(test.index.min(), color='black', ls='--')
plt.show()
def create_features(df):
    """Derive calendar features from a DatetimeIndex-ed frame.

    Returns a copy of *df* with dayofweek, quarter, month, year,
    dayofyear and dayofmonth columns appended; the input is untouched.
    """
    out = df.copy()
    idx = out.index
    out['dayofweek'] = idx.dayofweek
    out['quarter'] = idx.quarter
    out['month'] = idx.month
    out['year'] = idx.year
    out['dayofyear'] = idx.dayofyear
    out['dayofmonth'] = idx.day
    return out
df = create_features(df)
def add_lags(df):
    """Attach 'lag1': inside sales observed exactly 364 days earlier
    (364 = 52 weeks, same weekday).

    Mutates *df* in place and returns it; unmatched dates yield NaN.
    """
    history = df['daily_yoy_ndt.total_inside_sales'].to_dict()
    df['lag1'] = (df.index - pd.Timedelta('364 days')).map(history)
    return df
# Add the 364-day lag of inside sales as 'lag1'.
df = add_lags(df)
df.tail()
# Walk-forward cross-validation for daily inside sales.
tss = TimeSeriesSplit(n_splits=5)
df = df.sort_index()
preds = []
scores = []
TARGET = 'daily_yoy_ndt.total_inside_sales'
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    # BUG FIX: previously X_train/X_test were the whole frame INCLUDING the
    # target, so the model could "predict" sales from sales itself. Drop it.
    X_train = train.drop(columns=[TARGET])
    y_train = train[TARGET]
    X_test = test.drop(columns=[TARGET])
    y_test = test[TARGET]
    # 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)
    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)
# MAE/MSE/R2 below describe the LAST fold only; the mean RMSE covers all folds.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R-squared (R2) Score: {r2:.2f}')
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
# Refit on ALL history (no hold-out) before forecasting the future horizon.
# (The dead future/future_df creation that used to sit here was removed; the
# following cell recreates both before they are used.)
TARGET = 'daily_yoy_ndt.total_inside_sales'
df['isFuture'] = False  # flag column; it is one of the model inputs below
# Define features (excluding the target)
FEATURES = ['x5_min_pop', 'x1_2_mile_emp', 'square_feet', 'hi_flow_lanes',
            'womens_sink_count', 'x1_mile_pop', 'x7_min_pop', 'x1_2_mile_pop',
            'dayofweek', 'quarter', 'month', 'year', 'dayofyear',
            'dayofmonth', 'lag1', 'isFuture']
# NOTE(review): 'isFuture' is constant (False) in training but True for every
# prediction row — candidate for removal together with the prediction cell.
X = df[FEATURES]
y = df[TARGET]
# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
reg = xgb.XGBRegressor(base_score=0.5,
                       booster='gbtree',
                       n_estimators=500,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X, y, eval_set=[(X, y)], verbose=100)
# Build the inside-sales forecast frame: flag future rows, recompute calendar
# and lag features over history + future together, then score the future slice.
future = pd.date_range('2023-08-17','2024-12-31', freq='D')
future_df = pd.DataFrame(index=future)
future_df['isFuture'] = True
df['isFuture'] = False
df_and_future = pd.concat([df, future_df])
df_and_future = create_features(df_and_future)
df_and_future = add_lags(df_and_future)
# Keep only the horizon rows for scoring.
future_w_features = df_and_future.query('isFuture').copy()
future_w_features.head()
future_w_features = future_w_features.drop('daily_yoy_ndt.total_inside_sales', axis=1)
future_w_features['pred'] = reg.predict(future_w_features)
future_w_features.index = pd.to_datetime(future_w_features.index) # Make sure index is datetime
plt.figure(figsize=(10, 5))
plt.plot(future_w_features.index, future_w_features['pred'], color='r', marker='o', linestyle='-', linewidth=1, markersize=3)
plt.title('Future Predictions For Inside Sales ')
plt.xlabel('Date')
plt.ylabel('Inside Sales')
plt.grid(True)
plt.show()
# Full inside-sales history with the forecast overlaid.
plt.figure(figsize=(15, 5))
# Plot the entire historical data
encoded_df['daily_yoy_ndt.total_inside_sales'].plot(label='Historical Data', color='b')
# BUG FIX: the guide line sat at '2023-08-01' but the forecast horizon starts
# 2023-08-17 (as in the matching food/unleaded cells); align them.
plt.axvline('2023-08-17', color='black', ls='--')
# Plot future predictions
plt.plot(future_w_features.index, future_w_features['pred'], label='Future Predictions', color='red', linestyle='dashed')
plt.title('Historical Data and Future Predictions of Total Inside Sales')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.show()
# 503 daily rows: 2023-08-17 + 502 days lands on 2024-12-31, matching 'pred'.
start_date = '2023-08-17'
end_date = pd.to_datetime(start_date) + pd.DateOffset(days=502)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
# Create a DataFrame for the forecasted sales
forecasted_sales_df = pd.DataFrame({'Date': date_range, 'Sales': future_w_features['pred']})
# Display the DataFrame
print(forecasted_sales_df)
# Daily diesel sales as dots, with the all-time mean drawn for reference.
average_sales = encoded_df['diesel'].mean()
ax = encoded_df['diesel'].plot(style=".", figsize=(15,5), title="Diesel Sale over the Years")
ax.axhline(y=average_sales, color='r', linestyle='-', label='Average Sales')
ax.legend()
plt.show()
# Chronological split for diesel: before 2023 trains, 2023 onward tests.
train = encoded_df.loc[encoded_df.index < '2023-01-01', 'diesel']
test = encoded_df.loc[encoded_df.index >= '2023-01-01', 'diesel']
fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='training set')
test.plot(ax=ax, label='Test set')
ax.axvline('2023-01-01', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
# Train/test split at 2023-01-01 for diesel, then fit a gradient-boosted
# regressor on every remaining column.
X = encoded_df.drop(columns=['diesel'])
y = encoded_df['diesel']
# NOTE(review): X still contains the other sales series — confirm intended.
train_end_date = '2023-01-01'
X_train = X[X.index < train_end_date]
X_test = X[X.index >= train_end_date]
y_train = y[y.index < train_end_date]
y_test = y[y.index >= train_end_date]
# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                       n_estimators=1000,
                       early_stopping_rounds=50,  # stop when test RMSE stalls
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)
# Importances from the diesel model, then the 2023-01-01 split drawn over the
# eight strongest predictors.
fi = pd.DataFrame({'importance': reg.feature_importances_},
                  index=reg.feature_names_in_)
top_10_features = fi.sort_values('importance', ascending=False).head(10)
top_10_features.plot(kind='barh', title='Top 10 Feature Importance')
diesel_cols = ['diesel', 'hi_flow_lanes_fueling_positions', 'x1_mile_emp',
               'womens_toilet_count', 'daily_yoy_ndt.total_food_service',
               'rv_lanes_fueling_positions', 'site_id_msba',
               'traditional_forecourt_fueling_positions', 'x1_mile_pop']
train = encoded_df.loc[encoded_df.index < '2023-01-01', diesel_cols]
test = encoded_df.loc[encoded_df.index >= '2023-01-01', diesel_cols]
fig, ax = plt.subplots(figsize=(15, 5))
train['diesel'].plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test['diesel'].plot(ax=ax, label='Test Set')
ax.axvline('2023-01-01', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
# Visualize the five expanding walk-forward folds for diesel.
tss = TimeSeriesSplit(n_splits=5)
df = encoded_df[['diesel','hi_flow_lanes_fueling_positions','x1_mile_emp',
                 'womens_toilet_count','daily_yoy_ndt.total_food_service','rv_lanes_fueling_positions','site_id_msba','traditional_forecourt_fueling_positions','x1_mile_pop']].sort_index()
fig, axs = plt.subplots(5, 1, figsize=(15, 15), sharex=True)
for fold, (train_idx, val_idx) in enumerate(tss.split(df)):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    train['diesel'].plot(ax=axs[fold], label='Training Set',
                         title=f'Data Train/Test Split Fold {fold}')
    test['diesel'].plot(ax=axs[fold], label='Test Set')
    axs[fold].axvline(test.index.min(), color='black', ls='--')
plt.show()
def create_features(df):
    """Derive calendar features from a DatetimeIndex-ed frame.

    Returns a copy of *df* with dayofweek, quarter, month, year,
    dayofyear and dayofmonth columns appended; the input is untouched.
    """
    out = df.copy()
    idx = out.index
    out['dayofweek'] = idx.dayofweek
    out['quarter'] = idx.quarter
    out['month'] = idx.month
    out['year'] = idx.year
    out['dayofyear'] = idx.dayofyear
    out['dayofmonth'] = idx.day
    return out
df = create_features(df)
def add_lags(df):
    """Attach 'lag1': diesel sales observed exactly 364 days earlier
    (364 = 52 weeks, same weekday).

    Mutates *df* in place and returns it; unmatched dates yield NaN.
    """
    history = df['diesel'].to_dict()
    df['lag1'] = (df.index - pd.Timedelta('364 days')).map(history)
    return df
# Add the 364-day lag of diesel sales as 'lag1'.
df = add_lags(df)
df.tail()
# Walk-forward cross-validation for daily diesel sales.
tss = TimeSeriesSplit(n_splits=5)
df = df.sort_index()
preds = []
scores = []
TARGET = 'diesel'
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    # BUG FIX: previously X_train/X_test were the whole frame INCLUDING the
    # target, so the model could "predict" sales from sales itself. Drop it.
    X_train = train.drop(columns=[TARGET])
    y_train = train[TARGET]
    X_test = test.drop(columns=[TARGET])
    y_test = test[TARGET]
    # 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)
    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)
# MAE/MSE/R2 below describe the LAST fold only; the mean RMSE covers all folds.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R-squared (R2) Score: {r2:.2f}')
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
# Refit on ALL history (no hold-out) before forecasting the future horizon.
# (Removed a dead future/future_df creation — it even used a different start
# date, '2023-08-01' — since the next cell rebuilds both before use.)
# Define your target variable
TARGET = 'diesel'
df['isFuture'] = False  # flag column; it is one of the model inputs below
# Define features (excluding the target)
FEATURES = ['hi_flow_lanes_fueling_positions', 'x1_mile_emp',
            'womens_toilet_count', 'daily_yoy_ndt.total_food_service',
            'rv_lanes_fueling_positions', 'site_id_msba',
            'traditional_forecourt_fueling_positions', 'x1_mile_pop',
            'dayofweek', 'quarter', 'month', 'year', 'dayofyear',
            'dayofmonth', 'lag1', 'isFuture']
# NOTE(review): 'isFuture' is constant (False) in training but True for every
# prediction row — candidate for removal together with the prediction cell.
X = df[FEATURES]
y = df[TARGET]
# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
reg = xgb.XGBRegressor(base_score=0.5,
                       booster='gbtree',
                       n_estimators=500,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X, y, eval_set=[(X, y)], verbose=100)
# Build the diesel forecast frame: flag future rows, recompute calendar and
# lag features over history + future together, then score the future slice.
future = pd.date_range('2023-08-17','2024-12-31', freq='D')
future_df = pd.DataFrame(index=future)
future_df['isFuture'] = True
df['isFuture'] = False
df_and_future = pd.concat([df, future_df])
df_and_future = create_features(df_and_future)
df_and_future = add_lags(df_and_future)
# Keep only the horizon rows for scoring.
future_w_features = df_and_future.query('isFuture').copy()
future_w_features.head()
future_w_features = future_w_features.drop('diesel', axis=1)
future_w_features['pred'] = reg.predict(future_w_features)
future_w_features.index = pd.to_datetime(future_w_features.index) # Make sure index is datetime
plt.figure(figsize=(10, 5))
plt.plot(future_w_features.index, future_w_features['pred'], color='r', marker='o', linestyle='-', linewidth=1, markersize=3)
plt.title('Future Predictions For Diesel ')
plt.xlabel('Date')
plt.ylabel('Diesel')
plt.grid(True)
plt.show()
# Full diesel history with the forecast overlaid.
plt.figure(figsize=(15, 5))
encoded_df['diesel'].plot(label='Historical Data', color='b')
# BUG FIX: the guide line sat at '2023-08-01' but the forecast horizon starts
# 2023-08-17 (as in the matching food/unleaded cells); align them.
plt.axvline('2023-08-17', color='black', ls='--')
# Plot future predictions
plt.plot(future_w_features.index, future_w_features['pred'], label='Future Predictions', color='red', linestyle='dashed')
plt.title('Historical Data and Future Predictions of Diesel Sales')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.show()
# 503 daily rows: 2023-08-17 + 502 days lands on 2024-12-31, matching 'pred'.
start_date = '2023-08-17'
end_date = pd.to_datetime(start_date) + pd.DateOffset(days=502)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
# Create a DataFrame for the forecasted sales
forecasted_sales_df = pd.DataFrame({'Date': date_range, 'Sales': future_w_features['pred']})
# Display the DataFrame
print(forecasted_sales_df)
# Daily unleaded sales as dots, with the all-time mean drawn for reference.
average_sales = encoded_df['unleaded'].mean()
ax = encoded_df['unleaded'].plot(style=".", figsize=(15,5), title="Unleaded Sale over the Years")
ax.axhline(y=average_sales, color='r', linestyle='-', label='Average Sales')
ax.legend()
plt.show()
# Chronological split for unleaded: before 2023 trains, 2023 onward tests.
train = encoded_df.loc[encoded_df.index < '2023-01-01', 'unleaded']
test = encoded_df.loc[encoded_df.index >= '2023-01-01', 'unleaded']
fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='training set')
test.plot(ax=ax, label='Test set')
ax.axvline('2023-01-01', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
# Train/test split at 2023-01-01 for unleaded, then fit a gradient-boosted
# regressor on every remaining column.
X = encoded_df.drop(columns=['unleaded'])
y = encoded_df['unleaded']
# NOTE(review): X still contains the other sales series — confirm intended.
train_end_date = '2023-01-01'
X_train = X[X.index < train_end_date]
X_test = X[X.index >= train_end_date]
y_train = y[y.index < train_end_date]
y_test = y[y.index >= train_end_date]
# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                       n_estimators=1000,
                       early_stopping_rounds=50,  # stop when test RMSE stalls
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)
# Importances from the unleaded model, then the 2023-01-01 split drawn over
# the eight strongest predictors.
fi = pd.DataFrame({'importance': reg.feature_importances_},
                  index=reg.feature_names_in_)
top_10_features = fi.sort_values('importance', ascending=False).head(10)
top_10_features.plot(kind='barh', title='Top 10 Feature Importance')
unleaded_cols = ['unleaded', 'mens_toilet_count', 'x1_mile_income',
                 'traditional_forecourt_stack_type', 'square_feet', 'lottery',
                 'years_since_last_project', 'x7_min_emp', 'total_sales']
train = encoded_df.loc[encoded_df.index < '2023-01-01', unleaded_cols]
test = encoded_df.loc[encoded_df.index >= '2023-01-01', unleaded_cols]
fig, ax = plt.subplots(figsize=(15, 5))
train['unleaded'].plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test['unleaded'].plot(ax=ax, label='Test Set')
ax.axvline('2023-01-01', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
# Visualize the five expanding walk-forward folds for unleaded.
tss = TimeSeriesSplit(n_splits=5)
df = encoded_df[['unleaded','mens_toilet_count','x1_mile_income', 'traditional_forecourt_stack_type','square_feet','lottery','years_since_last_project','x7_min_emp','total_sales']].sort_index()
fig, axs = plt.subplots(5, 1, figsize=(15, 15), sharex=True)
for fold, (train_idx, val_idx) in enumerate(tss.split(df)):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    train['unleaded'].plot(ax=axs[fold], label='Training Set',
                           title=f'Data Train/Test Split Fold {fold}')
    test['unleaded'].plot(ax=axs[fold], label='Test Set')
    axs[fold].axvline(test.index.min(), color='black', ls='--')
plt.show()
def create_features(df):
    """Derive calendar features from a DatetimeIndex-ed frame.

    Returns a copy of *df* with dayofweek, quarter, month, year,
    dayofyear and dayofmonth columns appended; the input is untouched.
    """
    out = df.copy()
    idx = out.index
    out['dayofweek'] = idx.dayofweek
    out['quarter'] = idx.quarter
    out['month'] = idx.month
    out['year'] = idx.year
    out['dayofyear'] = idx.dayofyear
    out['dayofmonth'] = idx.day
    return out
df = create_features(df)
def add_lags(df):
    """Attach 'lag1': unleaded sales observed exactly 364 days earlier
    (364 = 52 weeks, same weekday).

    Mutates *df* in place and returns it; unmatched dates yield NaN.
    """
    history = df['unleaded'].to_dict()
    df['lag1'] = (df.index - pd.Timedelta('364 days')).map(history)
    return df
# Add the 364-day lag of unleaded sales as 'lag1'.
df = add_lags(df)
df.tail()
# Walk-forward cross-validation for daily unleaded sales.
tss = TimeSeriesSplit(n_splits=5)
df = df.sort_index()
preds = []
scores = []
TARGET = 'unleaded'
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    # BUG FIX: previously X_train/X_test were the whole frame INCLUDING the
    # target, so the model could "predict" sales from sales itself. Drop it.
    X_train = train.drop(columns=[TARGET])
    y_train = train[TARGET]
    X_test = test.drop(columns=[TARGET])
    y_test = test[TARGET]
    # 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)
    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)
# MAE/MSE/R2 below describe the LAST fold only; the mean RMSE covers all folds.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R-squared (R2) Score: {r2:.2f}')
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
# Refit on ALL history (no hold-out) before forecasting the future horizon.
# (Removed a dead future/future_df creation — it even used a different start
# date, '2023-08-01' — since the next cell rebuilds both before use.)
TARGET = 'unleaded'
df['isFuture'] = False  # flag column; it is one of the model inputs below
FEATURES = ['mens_toilet_count', 'x1_mile_income',
            'traditional_forecourt_stack_type', 'square_feet', 'lottery',
            'years_since_last_project', 'x7_min_emp', 'total_sales',
            'dayofweek', 'quarter', 'month', 'year', 'dayofyear',
            'dayofmonth', 'lag1', 'isFuture']
# NOTE(review): 'isFuture' is constant (False) in training but True for every
# prediction row — candidate for removal together with the prediction cell.
X = df[FEATURES]
y = df[TARGET]
# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
reg = xgb.XGBRegressor(base_score=0.5,
                       booster='gbtree',
                       n_estimators=500,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X, y, eval_set=[(X, y)], verbose=100)
# Build the unleaded forecast frame: flag future rows, recompute calendar and
# lag features over history + future together, then score the future slice.
future = pd.date_range('2023-08-17','2024-12-31', freq='D')
future_df = pd.DataFrame(index=future)
future_df['isFuture'] = True
df['isFuture'] = False
df_and_future = pd.concat([df, future_df])
df_and_future = create_features(df_and_future)
df_and_future = add_lags(df_and_future)
# Keep only the horizon rows for scoring.
future_w_features = df_and_future.query('isFuture').copy()
future_w_features.head()
future_w_features = future_w_features.drop('unleaded', axis=1)
future_w_features['pred'] = reg.predict(future_w_features)
future_w_features.index = pd.to_datetime(future_w_features.index) # Make sure index is datetime
plt.figure(figsize=(10, 5))
plt.plot(future_w_features.index, future_w_features['pred'], color='r', marker='o', linestyle='-', linewidth=1, markersize=3)
plt.title('Future Predictions For Unleaded ')
plt.xlabel('Date')
plt.ylabel('Unleaded')
plt.grid(True)
plt.show()
# Full unleaded history with the forecast overlaid after 2023-08-17.
plt.figure(figsize=(15, 5))
encoded_df['unleaded'].plot(label='Historical Data', color='b')
plt.axvline('2023-08-17', color='black', ls='--')
plt.plot(future_w_features.index, future_w_features['pred'], label='Future Predictions', color='red', linestyle='dashed')
plt.title('Historical Data and Future Predictions of Unleaded Sales')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.show()
# 503 daily rows: 2023-08-17 + 502 days lands on 2024-12-31, matching 'pred'.
start_date = '2023-08-17'
end_date = pd.to_datetime(start_date) + pd.DateOffset(days=502)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
forecasted_sales_df = pd.DataFrame({'Date': date_range, 'Sales': future_w_features['pred']})
print(forecasted_sales_df)
#!pip install pystan==2.19.1.1
#pip install --upgrade numpy
# Prophet section: re-key the merged frame by calendar date, then one-hot
# encode the remaining categorical columns.
from prophet import Prophet
# NOTE(review): this assumes t_data's index carries the calendar dates in the
# same row order as df_merged — confirm the upstream merge preserved alignment.
df_merged['calendar.calendar_day_date'] = t_data.index
df_merged.set_index('calendar.calendar_day_date', inplace=True)
import pandas as pd
categorical_columns = df_merged.select_dtypes(include=['object']).columns
encoded_df = pd.get_dummies(df_merged, columns=categorical_columns)
encoded_df.head()
# Train/test split for Prophet.
# NOTE(review): this uses <= / >, so 2023-01-01 itself lands in TRAIN, unlike
# the XGBoost cells above which use < / >= — confirm which is intended.
split_date = '2023-01-01'
inside_train = encoded_df['daily_yoy_ndt.total_inside_sales'].loc[encoded_df.index <= split_date].copy()
inside_test = encoded_df['daily_yoy_ndt.total_inside_sales'].loc[encoded_df.index > split_date].copy()
# Plot train and test so you can see where we have split
(inside_test
.rename_axis('Date')
.rename('TEST SET')
.to_frame()
.join(inside_train.rename_axis('Date').rename('TRAINING SET').to_frame(),
how='outer')
.plot(figsize=(10, 5), title='Inside Sales', style='.', ms=2)
)
plt.show()
# Prophet expects two columns: 'ds' (datestamp) and 'y' (target).
inside_train_prophet= inside_train.reset_index().rename(columns={'calendar.calendar_day_date':'ds','daily_yoy_ndt.total_inside_sales':'y'})
inside_train_prophet.head()
# Fit with default seasonality settings.
model= Prophet()
model.fit(inside_train_prophet)
# Score the held-out test dates.
inside_test_prophet= inside_test.reset_index().rename(columns={'calendar.calendar_day_date':'ds','daily_yoy_ndt.total_inside_sales':'y'})
inside_sales_tst_forecast= model.predict(inside_test_prophet)
inside_sales_tst_forecast.head()
# Forecast over the test window.
fig, ax = plt.subplots(figsize=(10, 5))
fig = model.plot(inside_sales_tst_forecast, ax=ax)
plt.show()
# Trend / weekly / yearly components learned by Prophet.
fig = model.plot_components(inside_sales_tst_forecast)
plt.show()
# Actuals (pink dots) on top of the forecast band.
f, ax = plt.subplots(figsize=(15,5))
ax.scatter(inside_test.index, inside_test, color='#FFB6C1')
fig = model.plot(inside_sales_tst_forecast, ax=ax)
# Zoom into January 2023: actuals (red) vs forecast.
f, ax = plt.subplots(figsize=(15,5))
ax.scatter(inside_test.index, inside_test, color='r')
fig = model.plot(inside_sales_tst_forecast, ax=ax)
ax.set_xlim(left=pd.to_datetime('2023-01-01'), right=pd.to_datetime('2023-01-31'))
ax.set_ylim(bottom=0, top=6000)
plt.suptitle('January 2023 Forecast vs Actual')
plt.show()
# Zoom in on the first week of January 2023: actuals (red) vs forecast.
f, ax = plt.subplots(figsize=(15, 5))
ax.scatter(inside_test.index, inside_test, color='r')
fig = model.plot(inside_sales_tst_forecast, ax=ax)
week_start = pd.to_datetime('2023-01-01')
week_end = pd.to_datetime('2023-01-07')
ax.set_xlim(left=week_start, right=week_end)
ax.set_ylim(bottom=0, top=6000)
plt.suptitle('January 2023 Forecast vs Actual')
plt.show()
#Evaluate the model with error metrics
from sklearn.metrics import mean_squared_error
# RMSE of the Prophet forecast over the test period.
# NOTE(review): assumes inside_test and the forecast have equal length and the
# same date order — verify, since predict() was fed inside_test_prophet.
np.sqrt(mean_squared_error(y_true=inside_test,y_pred=inside_sales_tst_forecast['yhat']))
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the MAPE (in percent) between two equal-length sequences.

    NOTE(review): a zero anywhere in y_true divides by zero — callers must
    ensure the actuals are nonzero. TODO confirm for this dataset.
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    relative_errors = np.abs((y_true - y_pred) / y_true)
    return np.mean(relative_errors) * 100
mean_absolute_percentage_error(y_true=inside_test, y_pred=inside_sales_tst_forecast['yhat'])
# Forecast 730 days (two years) past the end of the training data and plot.
future= model.make_future_dataframe(periods=730)
forecast=model.predict(future)
fig = model.plot(forecast, figsize=(15, 5))
plt.title('Prophet Forecast')
plt.xlabel('Date')
plt.ylabel('Value')
plt.show()
# Inspect the final 100 forecasted dates.
forecast[['ds','yhat']].tail(100)
XGBoost Model Performance
Food Sales: Mean Absolute Error (MAE): 2.51, Mean Squared Error (MSE): 34.41
Inside Sales: Mean Absolute Error (MAE): 6.68, Mean Squared Error (MSE): 423.45
Diesel: Mean Absolute Error (MAE): 12.91, Mean Squared Error (MSE): 2676.26
Unleaded: Mean Absolute Error (MAE): 5.83, Mean Squared Error (MSE): 166.44
Krishna Chaitanya: XGBoost Model (built for Inside Sales), Correlation Matrix, Seasonality, Analysis of unleaded and diesel sales, Results
Anukriti Raj: ARIMAX Model, XGBoost Model (built for Food Sales), Feature Engineering, Seasonality, Correlation Matrix, Analysis of unleaded and diesel sales, Results
Shivi Shrivastav: Introduction, XGBoost Model (built for Unleaded Sales), Table of Contents, Seasonality, Analysis of in-store and food sales, Results
Litzy Carbajal: Introduction, XGBoost Model (built for Diesel Sales), Seasonality, Analysis of in-store and food sales, Results